In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

from evidently import Report, Dataset, DataDefinition, Regression
from evidently.presets import DataDriftPreset, RegressionPreset
from evidently.ui.workspace import CloudWorkspace

# Notebook-wide configuration.
# No cap on the number of DataFrame columns rendered in cell output.
pd.options.display.max_columns = None
# Silence scikit-learn's UndefinedMetricWarning for the rest of the session.
warnings.simplefilter("ignore", category=UndefinedMetricWarning)
In [2]:
# Set up Evidently Cloud.
# Credentials are read from the environment (load_dotenv() also picks up a local
# .env file), so no secrets are stored in the notebook itself.
load_dotenv()
org_id = os.getenv("EVIDENTLY_ORG_ID")  # read for completeness; not used by the cells below
api_key = os.getenv("EVIDENTLY_API_KEY")
project_id = os.getenv("EVIDENTLY_PROJECT_ID")

# os.getenv returns None for unset variables; fail fast with a readable message
# instead of letting the client fail later with a less obvious error.
missing_vars = [
    var_name
    for var_name, value in (
        ("EVIDENTLY_API_KEY", api_key),
        ("EVIDENTLY_PROJECT_ID", project_id),
    )
    if not value
]
if missing_vars:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

ws = CloudWorkspace(token=api_key, url="https://app.evidently.cloud")
project = ws.get_project(project_id)
# Later cells dereference project.id, so make sure a project actually came back.
if project is None:
    raise RuntimeError(
        f"Evidently project {project_id!r} was not found or is not accessible"
    )
In [3]:
# Load the raw dataset; the CSV is decoded as Latin-1 rather than the default UTF-8.
df = pd.read_csv("cancer_reg.csv", encoding="latin1")
print(df.shape)
# Preview the first five rows.
df.head(5)
(3047, 34)
Out[3]:
avgAnnCount avgDeathsPerYear TARGET_deathRate incidenceRate medIncome popEst2015 povertyPercent studyPerCap binnedInc MedianAge MedianAgeMale MedianAgeFemale Geography AvgHouseholdSize PercentMarried PctNoHS18_24 PctHS18_24 PctSomeCol18_24 PctBachDeg18_24 PctHS25_Over PctBachDeg25_Over PctEmployed16_Over PctUnemployed16_Over PctPrivateCoverage PctPrivateCoverageAlone PctEmpPrivCoverage PctPublicCoverage PctPublicCoverageAlone PctWhite PctBlack PctAsian PctOtherRace PctMarriedHouseholds BirthRate
0 1397.0 469 164.9 489.8 61898 260131 11.2 499.748204 (61494.5, 125635] 39.3 36.9 41.7 Kitsap County, Washington 2.54 52.5 11.5 39.5 42.1 6.9 23.2 19.6 51.9 8.0 75.1 NaN 41.6 32.9 14.0 81.780529 2.594728 4.821857 1.843479 52.856076 6.118831
1 173.0 70 161.3 411.6 48127 43269 18.6 23.111234 (48021.6, 51046.4] 33.0 32.2 33.7 Kittitas County, Washington 2.34 44.5 6.1 22.4 64.0 7.5 26.0 22.7 55.9 7.8 70.2 53.8 43.6 31.1 15.3 89.228509 0.969102 2.246233 3.741352 45.372500 4.333096
2 102.0 50 174.7 349.7 49348 21026 14.6 47.560164 (48021.6, 51046.4] 45.0 44.0 45.8 Klickitat County, Washington 2.62 54.2 24.0 36.6 NaN 9.5 29.0 16.0 45.9 7.0 63.7 43.5 34.9 42.1 21.1 90.922190 0.739673 0.465898 2.747358 54.444868 3.729488
3 427.0 202 194.8 430.4 44243 75882 17.1 342.637253 (42724.4, 45201] 42.8 42.2 43.4 Lewis County, Washington 2.52 52.7 20.2 41.2 36.1 2.5 31.6 9.3 48.3 12.1 58.4 40.3 35.0 45.3 25.0 91.744686 0.782626 1.161359 1.362643 51.021514 4.603841
4 57.0 26 144.4 350.1 49955 10321 12.5 0.000000 (48021.6, 51046.4] 48.3 47.8 48.9 Lincoln County, Washington 2.34 57.8 14.9 43.0 40.0 2.0 33.4 15.0 48.2 4.8 61.6 43.9 35.1 44.0 22.7 94.104024 0.270192 0.665830 0.492135 54.027460 6.796657
In [4]:
# Preprocess data
# The model uses every column except the target itself and the two text columns
# ("Geography" holds place names, "binnedInc" holds interval labels).
target = "TARGET_deathRate"
non_feature_cols = {target, "Geography", "binnedInc"}
features = [c for c in df.columns if c not in non_feature_cols]

# Split data (fixed random_state so the split is reproducible)
train, test = train_test_split(df, test_size=0.2, random_state=42)
X_train, y_train = train[features], train[target]
X_test_orig, y_test_orig = test[features], test[target]

# Train model
# NOTE: some feature columns contain NaN (visible in df.head()); they are passed
# to the estimator as-is, with no imputation step.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate on original test set
pred_test_orig = model.predict(X_test_orig)
print("Original Test RMSE:", np.sqrt(mean_squared_error(y_test_orig, pred_test_orig)))
print("Original Test R²:", r2_score(y_test_orig, pred_test_orig))
Original Test RMSE: 19.141634432544688
Original Test R²: 0.5522098578423327
In [5]:
# Helper to evaluate and report results
def evaluate_and_report(name, X_cur, y_true):
    """Score the model on one scenario, print metrics, and upload an Evidently report.

    The training split (with the model's predictions on it) is used as the
    reference dataset; ``X_cur`` plus ``y_true`` and the model's predictions
    form the current dataset.

    Parameters
    ----------
    name : str
        Label printed above the metrics.
    X_cur : pandas.DataFrame
        Feature frame passed to ``model.predict``.
    y_true : array-like
        Ground-truth target values in the same row order as ``X_cur``.

    Returns
    -------
    The result of ``Report.run`` for this scenario (also uploaded via
    ``ws.add_run`` with ``include_data=False``).
    """
    preds = model.predict(X_cur)

    # Predictions are positional (row order of X_cur). Convert the targets to a
    # plain array so they are matched to rows by position everywhere; assigning
    # a Series would align on index and could silently insert NaN or shuffle
    # targets in the report while the printed metrics stay positional.
    y_true_values = np.asarray(y_true)

    # Reference data: training features, training target, and in-sample predictions.
    ref = train[features].copy()
    ref[target] = train[target]
    ref["prediction"] = model.predict(train[features])

    # Current data: the scenario's features with its targets and predictions.
    cur = X_cur.copy()
    cur[target] = y_true_values
    cur["prediction"] = preds

    data_def = DataDefinition(
        regression=[Regression(target=target, prediction="prediction")]
    )

    ref_ds = Dataset.from_pandas(ref, data_definition=data_def)
    cur_ds = Dataset.from_pandas(cur, data_definition=data_def)

    rmse = np.sqrt(mean_squared_error(y_true_values, preds))
    # R² is only computed when there are at least two samples.
    r2 = r2_score(y_true_values, preds) if len(y_true_values) >= 2 else float("nan")
    print(f"\n== {name} ==")
    print("RMSE:", rmse, "R²:", r2)

    # Run evaluation
    report = Report(metrics=[DataDriftPreset(), RegressionPreset()])
    eval_result = report.run(reference_data=ref_ds, current_data=cur_ds)

    # Upload to Cloud
    ws.add_run(project.id, eval_result, include_data=False)

    return eval_result

# Baseline: the untouched hold-out split.
evaluate_and_report(
    name="Baseline (Original Test)",
    X_cur=X_test_orig,
    y_true=y_test_orig,
)
== Baseline (Original Test) ==
RMSE: 19.141634432544688 R²: 0.5522098578423327
Out[5]:
In [6]:
# Build the shifted test sets. The scenarios are cumulative: each one starts
# from the previous frame and shifts one more feature. `assign` returns a new
# frame each time, so X_test_orig itself is never modified.

# Scenario A: median income lowered by 40k
test_A = X_test_orig.assign(medIncome=lambda frame: frame["medIncome"] - 40000)

# Scenario A+B: additionally, povertyPercent raised by 20
test_AB = test_A.assign(povertyPercent=lambda frame: frame["povertyPercent"] + 20)

# Scenario A+B+C: additionally, AvgHouseholdSize raised by 2
test_ABC = test_AB.assign(AvgHouseholdSize=lambda frame: frame["AvgHouseholdSize"] + 2)
In [7]:
# Scenario A: shifted income, original targets.
evaluate_and_report(
    name="Scenario A (Income ↓40k)",
    X_cur=test_A,
    y_true=y_test_orig,
)
== Scenario A (Income ↓40k) ==
RMSE: 21.5767213114231 R²: 0.43103258174724723
Out[7]:
In [8]:
# Scenario A+B: shifted income and poverty, original targets.
evaluate_and_report(
    name="Scenario A+B (Income ↓, Poverty ↑)",
    X_cur=test_AB,
    y_true=y_test_orig,
)
== Scenario A+B (Income ↓, Poverty ↑) ==
RMSE: 23.02972891944842 R²: 0.35182221121125634
Out[8]:
In [9]:
# Scenario A+B+C: shifted income, poverty and household size, original targets.
evaluate_and_report(
    name="Scenario A+B+C (Income ↓, Poverty ↑, HH Size ↑)",
    X_cur=test_ABC,
    y_true=y_test_orig,
)
== Scenario A+B+C (Income ↓, Poverty ↑, HH Size ↑) ==
RMSE: 22.529505067326458 R²: 0.37967426997046405
Out[9]:
In [ ]: